# -*- coding: utf-8 -*-

# 抑制gensim警告，要在导入gensim前
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')

from gensim import corpora
from gensim import models
from gensim.models import LdaModel
# 停用词
from nltk.corpus import stopwords
# 单词变体还原
from nltk.stem import WordNetLemmatizer
# 解析body域的HTML
from bs4 import BeautifulSoup
# 去除英文停用词，要先下载
from nltk.corpus import stopwords
# 去掉HTML标签
from HTMLParser import HTMLParser
# 去掉URL、数字
import re
# 去掉标点符号
import string
# 操纵MongoDB
import pymongo
# FIXING 乱码
import sys
# 不同主题数训练，映射词典
from collections import OrderedDict

# class to strip html
class MLStripper(HTMLParser):
    """HTMLParser subclass that keeps only text nodes, discarding all tags."""

    def __init__(self):
        # Classic Python 2 recipe: reset() performs the base-class
        # initialisation, so HTMLParser.__init__ is not called directly.
        self.reset()
        self._pieces = []

    def handle_data(self, data):
        # Parser callback: collect every chunk of text found between tags.
        self._pieces.append(data)

    def get_data(self):
        # Concatenate the collected text chunks into the tag-free result.
        return ''.join(self._pieces)


def strip_tags(html):
    """Return *html* with all markup removed, keeping only the text content."""
    stripper = MLStripper()
    stripper.feed(html)
    return stripper.get_data()

# 过滤每一行内容，形成texts，作为dictionary生成的输入
def filter_lines(lines):
    texts = []
    for line in lines:
        # 去除标签，只保留内容
        stripped_line = strip_tags(line)

        # 去掉 URL
        stripped_line = re.sub(
            r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?]))',
            '', stripped_line)

        # 去掉空白字符
        regex = re.compile('[%s]' % re.escape(string.punctuation))
        stripped_line = regex.sub(' ', stripped_line)

        # 去掉数字
        stripped_line = re.sub(" \d+", " ", stripped_line)

        # 去掉各种停用词，单词变体还原为动词形式（测试效果不是完全的）
        text = [lemmatizer.lemmatize(word, 'v') for word in stripped_line.lower().split() if word not in stop]
        if text:
            texts.append(text)

    return texts

def print_coherence_rankings(coherences, coherence_label='c_v'):
    """Print models ranked by average coherence and return the best topic count.

    :param coherences: mapping ``num_topics -> (per_topic_coherences, avg)``
        as produced by ``CoherenceModel.compare_models``
    :param coherence_label: name of the coherence measure for the heading;
        defaults to 'c_v' (the measure this script uses), removing the old
        hidden dependency on the global ``cm``
    :return: the ``num_topics`` key with the highest average coherence
    """
    # Renamed from avg_coherence: the old name shadowed the tuple element
    # bound inside its own comprehension.
    averages = [(num_topics, avg)
                for num_topics, (_, avg) in coherences.items()]
    ranked = sorted(averages, key=lambda tup: tup[1], reverse=True)
    print("Ranked by average '%s' coherence:\n" % coherence_label)
    for item in ranked:
        print("num_topics=%d:\t%.4f" % item)
    print("\nBest: %d" % ranked[0][0])
    return ranked[0][0]


if __name__ == '__main__':

    # Python 2-only encoding hack: reload() restores the setdefaultencoding
    # attribute that site.py removes, then force UTF-8 as the default string
    # encoding. Not portable to Python 3.
    reload(sys)
    sys.setdefaultencoding('utf8')

    # English stop-word set (requires the NLTK 'stopwords' corpus download).
    stop = set(stopwords.words('english'))

    # Lemmatizer used by filter_lines (requires the NLTK WordNet corpus).
    lemmatizer = WordNetLemmatizer()

    # MongoDB on localhost: database "question", collection "question".
    db = pymongo.MongoClient().question
    coll = db.question

    index = 0
    # NOTE(review): printed once before the loop, so it always shows "No. 0";
    # presumably intended inside the loop to label each document.
    print("No. " + str(index))

    # Iterate over every document in the collection.
    for cursor in coll.find():
        _id = cursor['_id']
        title = cursor['title']
        body = cursor['body']

        soup = BeautifulSoup(body, "html.parser")
        # Remove <code>...</code> blocks entirely — code snippets would
        # pollute the topic vocabulary.
        for code in soup("code"):
            code.decompose()
        body = str(soup)

        lines = body.split('\n')
        # The title carries important information too — treat it as a line.
        lines.append(title)

        # Clean and tokenise the lines.
        texts = filter_lines(lines)
        # print(texts)

        # Build the dictionary from this document's token lists.
        dictionary = corpora.Dictionary(texts)

        # Encode each token list against the dictionary as a bag-of-words
        # vector. NOTE(review): 'question.mm' is overwritten for every
        # document in the collection.
        corpus = [dictionary.doc2bow(text) for text in texts]
        mm_path = 'question.mm'
        corpora.MmCorpus.serialize(mm_path, corpus, id2word=dictionary)
        mm_corpus = corpora.MmCorpus(mm_path)  # load back in to use for LDA training

        # Train one model per candidate topic count, k = 1..9.
        trained_models = OrderedDict()
        for num_topics in range(1, 10, 1):
            print("Training LDA(k=%d)" % num_topics)
            # LdaMulticore parallelises training across CPU cores.
            lda = models.LdaMulticore(
                mm_corpus, id2word=dictionary, num_topics=num_topics,
                passes=5, iterations=50, random_state=42, eval_every=None,
                alpha='asymmetric',  # shown to be better than symmetric in most cases
                decay=0.5, offset=64  # best params from Hoffman paper
            )
            trained_models[num_topics] = lda

        # Evaluate coherence ('c_v') for each trained model.
        # NOTE(review): .values()/.keys() are lists in Python 2; on Python 3
        # they would be one-shot views — this code assumes Python 2.
        cm = models.CoherenceModel.for_models(
            trained_models.values(), dictionary, texts=texts, coherence='c_v')

        coherence_estimates = cm.compare_models(trained_models.values())
        coherences = dict(zip(trained_models.keys(), coherence_estimates))

        # Rank the models and pick the best number of topics.
        num_topics = print_coherence_rankings(coherences)

        # Assemble this document's result object.
        result = {}
        result["num_topics"] = num_topics
        # coherences[k] is (per_topic_coherences, avg); store the average.
        result["avg_coherence"] = coherences[num_topics][1]
        topics = []
        for topic_id, topic in trained_models[num_topics].show_topics(num_words=20, formatted=False):
            a_topic = {}
            a_topic["topic_id"] = topic_id
            a_topic["words"] = [word for word, _ in topic]

            topics.append(a_topic)

        result["topics"]=topics

        # Write the topic result back onto the document.
        # NOTE(review): update() is the pre-pymongo-3 API; update_one() is
        # the modern equivalent.
        coll.update({"_id": _id}, {"$set": {"topic": result}})

        index = index + 1
        # Process only the first 6 documents, then stop.
        if index > 5:
            break
